{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "5LCzoheJKW8X",
    "outputId": "2eddc9d2-b95d-4868-dfd8-65a0b186fa60"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.9/163.9 kB\u001b[0m \u001b[31m11.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m26.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.2/11.2 MB\u001b[0m \u001b[31m60.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.8/34.8 MB\u001b[0m \u001b[31m18.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.1/4.1 MB\u001b[0m \u001b[31m86.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m68.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m36.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m50.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m42.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m866.1/866.1 kB\u001b[0m \u001b[31m63.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m135.0/135.0 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.1/45.1 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m76.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25h  Building wheel for FlagEmbedding (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "  Building wheel for warc3-wet-clueweb09 (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "  Building wheel for cbor (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "!pip install -U transformers datasets==3.2.0 openai lancedb lance FlagEmbedding \"tantivy>=0.20.1\" -qq\n",
    "\n",
    "# NOTE: If there is an import error, restart and run the notebook again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "vP6d6JUShgqo"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import lancedb\n",
    "import re\n",
    "import pandas as pd\n",
    "import random\n",
    "\n",
    "from datasets import load_dataset\n",
    "\n",
    "import torch\n",
    "import gc\n",
    "\n",
    "import lance\n",
    "\n",
    "\n",
    "import os\n",
    "\n",
    "import lancedb\n",
    "import openai\n",
    "from lancedb.embeddings import get_registry\n",
    "from lancedb.pydantic import LanceModel, Vector\n",
    "\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"sk-proj-....\"\n",
    "\n",
    "\n",
    "embeddings = get_registry().get(\"openai\").create()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "8eKRYd2F7v5n"
   },
   "source": [
    "# Load `Chunks` of data from [BeIR Dataset](https://huggingface.co/datasets/BeIR/scidocs)\n",
    "\n",
    "Note: This is a dataset built specially for retrieval tasks to see how good your search is working"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 507,
     "referenced_widgets": [
      "f1cfd2a04cea4a97a4697e44b47d8f5c",
      "bba43bfe1e284cfd9c959e7d0a078bc8",
      "306070aaed3c4681916785d993387ea2",
      "c57b8c9da33e488da5791f8f9aaf4115",
      "7679689d29ba4c719a415494415bbccc",
      "06311f49979047d99861c232b521e335",
      "5c2fb58062d144a59957bb8edbe1809f",
      "49f44448572d4b64a7fcf24b5aa603e9",
      "59e641456aea4bdd89838e46a61a0dd1",
      "85abbb024f624c86a2e1d7a7d9b6c27f",
      "466e746e993f48328cb56d1bd6938975",
      "f4869f44a44d436db7128d3d7518fb3d",
      "4bd3383d8ce548c2a449a0c8d9078bd5",
      "d16d7e3679004ea89f4a57eae7ec1c55",
      "29f21c0ef9ed4d1fb92b1106d2660836",
      "0d13e445976c414da79501d38b152509",
      "1f4be27377c44b4e9d501db532f20b36",
      "507017a7b63549e9beeed077b9ef59bc",
      "c9d6147ba1ce4832b202154477b96f2b",
      "613073bd7e1e41d4acf5e10ad361a90b",
      "00c544bdd6454297bec4671f156c167d",
      "46501f3a3f124fa8826c988de9a76188",
      "ae9100ca06174190a29fa408da0496ea",
      "fd4e62610a244f6b9228ed84b9a09ec6",
      "d9ed9871b4954f9c929b8436d1fbabea",
      "6f5e8e9d6cda4d78a22528ac826d6cbc",
      "20756effb6cf4f0d8e6ec402d3750d47",
      "9b689e9c36a14aab821553cabf9e7125",
      "268533d1adb0445f9f72e7d2b61706b1",
      "f98d29deac43421897014584f379342c",
      "99b03ee3735c4715be07c9dff052d3b3",
      "faea41ba3d56465c935151c5bbbb1729",
      "2edfda76ffcf49eda06038472d59c99c",
      "8c1b8fe6aea849fa9018b1e36d90e379",
      "c7c9aa62e18b49a1b6107a467e80d89d",
      "e58f98de451f4cc3beca8059c1fbd729",
      "101bef84d63447d788a43883bf24ea5e",
      "309493a98ccf44428b93df0b08d2007f",
      "a003d8f3fa714876aaccba8e81b3d727",
      "1dd71a85d3644081bce0276394a0ea2d",
      "bbce05361ec344a3a1bb54c1c62d64ed",
      "dd9e64e3f3b547448dc7175fda0ddae5",
      "7ff9d83da6c74f73a71bed4a85e07ae7",
      "5547e31d78ca4df08a671a88a416f66d",
      "2cbe40bd698d4758a9923a11d51487c7",
      "c621112f597f4e4f986b0f23fb27affb",
      "6eb7cc3f25ad4efe8d2d44b1c31e0c23",
      "2736de8b87e14272967fe43f5393669d",
      "bb8006adfb5b4564ab7a9ebecf658b52",
      "7c97df9dc34042519d1f7e8a643d4971",
      "8279576ce0b5434db0e1881a378b34df",
      "a94618a823fa48abbad2c80ce90ac41c",
      "51ecfc4e544c4a62925a9271f5bd8261",
      "7a4ee4ac659149caaf93ad478b9c3631",
      "0099f15e0ae349e29520cc4cbc40d515",
      "e42b9b3860564cbd930a9ecf09a158a3",
      "189a7706f9d14d10b5db4732174e950f",
      "07d847c0a15d452d8d5e12aeca9ede57",
      "a281fe1d5d664b6bb7e5aa79207f982d",
      "f3a936f554674591b5ef101e8d1267a7",
      "66c2c13d0201442bbefc9958fdf42ffb",
      "fef8e38388cf4339acd9c065b71cb0c7",
      "d7d4d6f77fc345d398820b0a6fe2a820",
      "86afd904d9214726a2e338b3ec9a0c20",
      "1db229e6a89f48bdb44499b78f8d423b",
      "09d27fa090854a6f9fed806f456169bd"
     ]
    },
    "id": "l0ezDr7suAf_",
    "outputId": "9f525f83-382c-4217-cfe0-e3ac47664be2"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f1cfd2a04cea4a97a4697e44b47d8f5c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md: 0.00B [00:00, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f4869f44a44d436db7128d3d7518fb3d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "scidocs.py: 0.00B [00:00, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ae9100ca06174190a29fa408da0496ea",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0000.parquet:   0%|          | 0.00/93.3k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8c1b8fe6aea849fa9018b1e36d90e379",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating queries split:   0%|          | 0/1000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2cbe40bd698d4758a9923a11d51487c7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0000.parquet:   0%|          | 0.00/19.0M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e42b9b3860564cbd930a9ecf09a158a3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating corpus split:   0%|          | 0/25657 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipython-input-4-751017049.py:16: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  docs[\"num_words\"] = docs[\"text\"].apply(\n"
     ]
    },
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "summary": "{\n  \"name\": \"docs\",\n  \"rows\": 5,\n  \"fields\": [\n    {\n      \"column\": \"_id\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"105a0b3826710356e218685f87b20fe39c64c706\",\n          \"30f46fdfe1fdab60bdecaa27aaa94526dfd87ac1\",\n          \"abd0478f1572d8ecdca4738df3e4b3bd116d7b42\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"Opinion observer: analyzing and comparing opinions on the Web\",\n          \"Real-Time 3D Reconstruction and 6-DoF Tracking with an Event Camera\",\n          \"Dispositional Factors in Internet Use: Personality Versus Cognitive Style\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"The Web has become an excellent source for gathering consumer opinions. There are now numerous Web sites containing such opinions, e.g., customer reviews of products, forums, discussion groups, and blogs. This paper focuses on online customer reviews of products. It makes two contributions. First, it proposes a novel framework for analyzing and comparing consumer opinions of competing products. A prototype system called Opinion Observer is also implemented. The system is such that with a single glance of its visualization, the user is able to clearly see the strengths and weaknesses of each product in the minds of consumers in terms of various product features. This comparison is useful to both potential customers and product manufacturers. For a potential customer, he/she can see a visual side-by-side and feature-by-feature comparison of consumer opinions on these products, which helps him/her to decide which product to buy. For a product manufacturer, the comparison enables it to easily gather marketing intelligence and product benchmarking information. Second, a new technique based on language pattern mining is proposed to extract product features from Pros and Cons in a particular type of reviews. Such features form the basis for the above comparison. Experimental results show that the technique is highly effective and outperform existing methods significantly.\",\n          \"We propose a method which can perform real-time 3D reconstruction from a single hand-held event camera with no additional sensing, and works in unstructured scenes of which it has no prior knowledge. It is based on three decoupled probabilistic filters, each estimating 6-DoF camera motion, scene logarithmic (log) intensity gradient and scene inverse depth relative to a keyframe, and we build a real-time graph of these to track and model over an extended local workspace. We also upgrade the gradient estimate for each keyframe into an intensity image, allowing us to recover a real-time video-like intensity sequence with spatial and temporal super-resolution from the low bit-rate input event stream. To the best of our knowledge, this is the first algorithm provably able to track a general 6D motion along with reconstruction of arbitrary structure including its intensity and the reconstruction of grayscale video that exclusively relies on event camera data.\",\n          \"This study directly tests the effect of personality and cognitive style on three measures of Internet use. The results support the use of personality\\u2014but not cognitive style\\u2014as an antecedent variable. After controlling for computer anxiety, selfefficacy, and gender, including the \\u201cBig Five\\u201d personality factors in the analysis significantly adds to the predictive capabilities of the dependent variables. Including cognitive style does not. The results are discussed in terms of the role of personality and cognitive style in models of technology adoption and use.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"num_words\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 55,\n        \"min\": 83,\n        \"max\": 212,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          208,\n          150,\n          83\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
       "type": "dataframe"
      },
      "text/html": [
       "\n",
       "  <div id=\"df-02b4e04a-2cb5-429a-9a79-9f15b7b21903\" class=\"colab-df-container\">\n",
       "    <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>_id</th>\n",
       "      <th>title</th>\n",
       "      <th>text</th>\n",
       "      <th>num_words</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>506d4ca228f81715946ed1ad8d9205fad20fddfe</td>\n",
       "      <td>Measuring pictorial balance perception at firs...</td>\n",
       "      <td>According to art theory, pictorial balance act...</td>\n",
       "      <td>212</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>105a0b3826710356e218685f87b20fe39c64c706</td>\n",
       "      <td>Opinion observer: analyzing and comparing opin...</td>\n",
       "      <td>The Web has become an excellent source for gat...</td>\n",
       "      <td>208</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>abd0478f1572d8ecdca4738df3e4b3bd116d7b42</td>\n",
       "      <td>Dispositional Factors in Internet Use: Persona...</td>\n",
       "      <td>This study directly tests the effect of person...</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>e645cbd3aaeab56858f1e752677b8792d7377d14</td>\n",
       "      <td>BCSAT : A Benchmark Corpus for Sentiment Analy...</td>\n",
       "      <td>The presented work aims at generating a system...</td>\n",
       "      <td>126</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>30f46fdfe1fdab60bdecaa27aaa94526dfd87ac1</td>\n",
       "      <td>Real-Time 3D Reconstruction and 6-DoF Tracking...</td>\n",
       "      <td>We propose a method which can perform real-tim...</td>\n",
       "      <td>150</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "    <div class=\"colab-df-buttons\">\n",
       "\n",
       "  <div class=\"colab-df-container\">\n",
       "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-02b4e04a-2cb5-429a-9a79-9f15b7b21903')\"\n",
       "            title=\"Convert this dataframe to an interactive table.\"\n",
       "            style=\"display:none;\">\n",
       "\n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
       "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
       "  </svg>\n",
       "    </button>\n",
       "\n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    .colab-df-buttons div {\n",
       "      margin-bottom: 4px;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "    <script>\n",
       "      const buttonEl =\n",
       "        document.querySelector('#df-02b4e04a-2cb5-429a-9a79-9f15b7b21903 button.colab-df-convert');\n",
       "      buttonEl.style.display =\n",
       "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "      async function convertToInteractive(key) {\n",
       "        const element = document.querySelector('#df-02b4e04a-2cb5-429a-9a79-9f15b7b21903');\n",
       "        const dataTable =\n",
       "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                    [key], {});\n",
       "        if (!dataTable) return;\n",
       "\n",
       "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "          + ' to learn more about interactive tables.';\n",
       "        element.innerHTML = '';\n",
       "        dataTable['output_type'] = 'display_data';\n",
       "        await google.colab.output.renderOutput(dataTable, element);\n",
       "        const docLink = document.createElement('div');\n",
       "        docLink.innerHTML = docLinkHtml;\n",
       "        element.appendChild(docLink);\n",
       "      }\n",
       "    </script>\n",
       "  </div>\n",
       "\n",
       "\n",
       "    <div id=\"df-4f30e616-0f86-411f-ae6e-0c63095f150f\">\n",
       "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-4f30e616-0f86-411f-ae6e-0c63095f150f')\"\n",
       "                title=\"Suggest charts\"\n",
       "                style=\"display:none;\">\n",
       "\n",
       "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "     width=\"24px\">\n",
       "    <g>\n",
       "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
       "    </g>\n",
       "</svg>\n",
       "      </button>\n",
       "\n",
       "<style>\n",
       "  .colab-df-quickchart {\n",
       "      --bg-color: #E8F0FE;\n",
       "      --fill-color: #1967D2;\n",
       "      --hover-bg-color: #E2EBFA;\n",
       "      --hover-fill-color: #174EA6;\n",
       "      --disabled-fill-color: #AAA;\n",
       "      --disabled-bg-color: #DDD;\n",
       "  }\n",
       "\n",
       "  [theme=dark] .colab-df-quickchart {\n",
       "      --bg-color: #3B4455;\n",
       "      --fill-color: #D2E3FC;\n",
       "      --hover-bg-color: #434B5C;\n",
       "      --hover-fill-color: #FFFFFF;\n",
       "      --disabled-bg-color: #3B4455;\n",
       "      --disabled-fill-color: #666;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart {\n",
       "    background-color: var(--bg-color);\n",
       "    border: none;\n",
       "    border-radius: 50%;\n",
       "    cursor: pointer;\n",
       "    display: none;\n",
       "    fill: var(--fill-color);\n",
       "    height: 32px;\n",
       "    padding: 0;\n",
       "    width: 32px;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart:hover {\n",
       "    background-color: var(--hover-bg-color);\n",
       "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "    fill: var(--button-hover-fill-color);\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart-complete:disabled,\n",
       "  .colab-df-quickchart-complete:disabled:hover {\n",
       "    background-color: var(--disabled-bg-color);\n",
       "    fill: var(--disabled-fill-color);\n",
       "    box-shadow: none;\n",
       "  }\n",
       "\n",
       "  .colab-df-spinner {\n",
       "    border: 2px solid var(--fill-color);\n",
       "    border-color: transparent;\n",
       "    border-bottom-color: var(--fill-color);\n",
       "    animation:\n",
       "      spin 1s steps(1) infinite;\n",
       "  }\n",
       "\n",
       "  @keyframes spin {\n",
       "    0% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "      border-left-color: var(--fill-color);\n",
       "    }\n",
       "    20% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    30% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    40% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    60% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    80% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "    90% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "  }\n",
       "</style>\n",
       "\n",
       "      <script>\n",
       "        async function quickchart(key) {\n",
       "          const quickchartButtonEl =\n",
       "            document.querySelector('#' + key + ' button');\n",
       "          quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
       "          quickchartButtonEl.classList.add('colab-df-spinner');\n",
       "          try {\n",
       "            const charts = await google.colab.kernel.invokeFunction(\n",
       "                'suggestCharts', [key], {});\n",
       "          } catch (error) {\n",
       "            console.error('Error during call to suggestCharts:', error);\n",
       "          }\n",
       "          quickchartButtonEl.classList.remove('colab-df-spinner');\n",
       "          quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
       "        }\n",
       "        (() => {\n",
       "          let quickchartButtonEl =\n",
       "            document.querySelector('#df-4f30e616-0f86-411f-ae6e-0c63095f150f button');\n",
       "          quickchartButtonEl.style.display =\n",
       "            google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "        })();\n",
       "      </script>\n",
       "    </div>\n",
       "\n",
       "    </div>\n",
       "  </div>\n"
      ],
      "text/plain": [
       "                                         _id  \\\n",
       "15  506d4ca228f81715946ed1ad8d9205fad20fddfe   \n",
       "34  105a0b3826710356e218685f87b20fe39c64c706   \n",
       "49  abd0478f1572d8ecdca4738df3e4b3bd116d7b42   \n",
       "46  e645cbd3aaeab56858f1e752677b8792d7377d14   \n",
       "54  30f46fdfe1fdab60bdecaa27aaa94526dfd87ac1   \n",
       "\n",
       "                                                title  \\\n",
       "15  Measuring pictorial balance perception at firs...   \n",
       "34  Opinion observer: analyzing and comparing opin...   \n",
       "49  Dispositional Factors in Internet Use: Persona...   \n",
       "46  BCSAT : A Benchmark Corpus for Sentiment Analy...   \n",
       "54  Real-Time 3D Reconstruction and 6-DoF Tracking...   \n",
       "\n",
       "                                                 text  num_words  \n",
       "15  According to art theory, pictorial balance act...        212  \n",
       "34  The Web has become an excellent source for gat...        208  \n",
       "49  This study directly tests the effect of person...         83  \n",
       "46  The presented work aims at generating a system...        126  \n",
       "54  We propose a method which can perform real-tim...        150  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "from datasets import config\n",
    "\n",
    "\n",
    "queries = load_dataset(\"BeIR/scidocs\", \"queries\")[\"queries\"].to_pandas()\n",
    "full_docs = (\n",
    "    load_dataset(\"BeIR/scidocs\", \"corpus\")[\"corpus\"].to_pandas().dropna(subset=\"text\")\n",
    ")\n",
    "\n",
    "docs = full_docs.head(64)  # just random samples for faster embed demo\n",
    "docs[\"num_words\"] = docs[\"text\"].apply(\n",
    "    lambda x: len(x.split())\n",
    ")  # Insert some Metadata for a more \"HYBRID\" search\n",
    "docs.sample(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HJf8xZmX8VJC"
   },
   "source": [
    "# Build New Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "id": "GkhFiOZAwVWw"
   },
   "outputs": [],
   "source": [
    "!rm -rf ./db"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "id": "5aljyqpUiViE"
   },
   "outputs": [],
   "source": [
    "class Documents(LanceModel):\n",
    "    vector: Vector(embeddings.ndims()) = embeddings.VectorField()\n",
    "    text: str = embeddings.SourceField()\n",
    "    title: str\n",
    "    num_words: int\n",
    "\n",
    "\n",
    "data = docs.apply(\n",
    "    lambda row: {\n",
    "        \"title\": row[\"title\"],\n",
    "        \"text\": row[\"text\"],\n",
    "        \"num_words\": row[\"num_words\"],\n",
    "    },\n",
    "    axis=1,\n",
    ").values.tolist()\n",
    "\n",
    "db = lancedb.connect(\"./db\")\n",
    "table = db.create_table(\"documents\", schema=Documents)\n",
    "\n",
    "table.add(data)  # ingest docs with auto-vectorization\n",
    "table.create_fts_index(\"text\")  # Create a fts index before the hybrid search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 206
    },
    "id": "OS-soek-yYm4",
    "outputId": "12f297e0-9dad-4c7a-9eba-be93ffc7bef2"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "summary": "{\n  \"name\": \")\",\n  \"rows\": 5,\n  \"fields\": [\n    {\n      \"column\": \"vector\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"Iron, the most ubiquitous of the transition metals and the fourth most plentiful element in the Earth's crust, is the structural backbone of our modern infrastructure. It is therefore ironic that as a nanoparticle, iron has been somewhat neglected in favor of its own oxides, as well as other metals such as cobalt, nickel, gold, and platinum. This is unfortunate, but understandable. Iron's reactivity is important in macroscopic applications (particularly rusting), but is a dominant concern at the nanoscale. Finely divided iron has long been known to be pyrophoric, which is a major reason that iron nanoparticles have not been more fully studied to date. This extreme reactivity has traditionally made iron nanoparticles difficult to study and inconvenient for practical applications. Iron however has a great deal to offer at the nanoscale, including very potent magnetic and catalytic properties. Recent work has begun to take advantage of iron's potential, and work in this field appears to be blossoming.\",\n          \"We introduce a long short-term memory recurrent neural network (LSTM-RNN) approach for real-time facial animation, which automatically estimates head rotation and facial action unit activations of a speaker from just her speech. Specifically, the time-varying contextual non-linear mapping between audio stream and visual facial movements is realized by training a LSTM neural network on a large audio-visual data corpus. In this work, we extract a set of acoustic features from input audio, including Mel-scaled spectrogram, Mel frequency cepstral coefficients and chromagram that can effectively represent both contextual progression and emotional intensity of the speech. Output facial movements are characterized by 3D rotation and blending expression weights of a blendshape model, which can be used directly for animation. Thus, even though our model does not explicitly predict the affective states of the target speaker, her emotional manifestation is recreated via expression weights of the face model. Experiments on an evaluation dataset of different speakers across a wide range of affective states demonstrate promising results of our approach in real-time speech-driven facial animation.\",\n          \"Mobility prediction is one of the most essential issues that need to be explored for mobility management in mobile computing systems. In this paper, we propose a new algorithm for predicting the next inter-cell movement of a mobile user in a Personal Communication Systems network. In the first phase of our threephase algorithm, user mobility patterns are mined from the history of mobile user trajectories. In the second phase, mobility rules are extracted from these patterns, and in the last phase, mobility predictions are accomplished by using these rules. The performance of the proposed algorithm is evaluated through simulation as compared to two other prediction methods. The performance results obtained in terms of Precision and Recall indicate that our method can make more accurate predictions than the other methods. 2004 Elsevier B.V. All rights reserved.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"Synthesis, properties, and applications of iron nanoparticles.\",\n          \"Speech-driven 3 D Facial Animation with Implicit Emotional Awareness : A Deep Learning Approach\",\n          \"A data mining approach for location prediction in mobile environments\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"num_words\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 40,\n        \"min\": 100,\n        \"max\": 210,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          158,\n          171,\n          135\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"_score\",\n      \"properties\": {\n        \"dtype\": \"float32\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          47.56113815307617,\n          27.97134780883789,\n          43.39748764038086\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
       "type": "dataframe"
      },
      "text/html": [
       "\n",
       "  <div id=\"df-a69f5ed7-820c-470a-bb2c-961f795af416\" class=\"colab-df-container\">\n",
       "    <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>vector</th>\n",
       "      <th>text</th>\n",
       "      <th>title</th>\n",
       "      <th>num_words</th>\n",
       "      <th>_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[-0.012233193, -0.0030392595, 0.022818677, -0....</td>\n",
       "      <td>Sociological and technical difficulties, such ...</td>\n",
       "      <td>Hipikat: a project memory for software develop...</td>\n",
       "      <td>210</td>\n",
       "      <td>48.126713</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[-0.00843631, -0.013696598, 0.0075827544, -0.0...</td>\n",
       "      <td>Iron, the most ubiquitous of the transition me...</td>\n",
       "      <td>Synthesis, properties, and applications of iro...</td>\n",
       "      <td>158</td>\n",
       "      <td>47.561138</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[0.005474513, -0.0017037347, 0.009329896, -0.0...</td>\n",
       "      <td>Mobility prediction is one of the most essenti...</td>\n",
       "      <td>A data mining approach for location prediction...</td>\n",
       "      <td>135</td>\n",
       "      <td>43.397488</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[-0.010289601, -0.0014841412, -0.0050512585, -...</td>\n",
       "      <td>It's not surprisingly when entering this site ...</td>\n",
       "      <td>Genetic Fuzzy Systems - Evolutionary Tuning an...</td>\n",
       "      <td>100</td>\n",
       "      <td>36.493389</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[-0.03584866, -0.00017609358, 0.00922457, -0.0...</td>\n",
       "      <td>We introduce a long short-term memory recurren...</td>\n",
       "      <td>Speech-driven 3 D Facial Animation with Implic...</td>\n",
       "      <td>171</td>\n",
       "      <td>27.971348</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "    <div class=\"colab-df-buttons\">\n",
       "\n",
       "  <div class=\"colab-df-container\">\n",
       "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-a69f5ed7-820c-470a-bb2c-961f795af416')\"\n",
       "            title=\"Convert this dataframe to an interactive table.\"\n",
       "            style=\"display:none;\">\n",
       "\n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
       "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
       "  </svg>\n",
       "    </button>\n",
       "\n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    .colab-df-buttons div {\n",
       "      margin-bottom: 4px;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "    <script>\n",
       "      const buttonEl =\n",
       "        document.querySelector('#df-a69f5ed7-820c-470a-bb2c-961f795af416 button.colab-df-convert');\n",
       "      buttonEl.style.display =\n",
       "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "      async function convertToInteractive(key) {\n",
       "        const element = document.querySelector('#df-a69f5ed7-820c-470a-bb2c-961f795af416');\n",
       "        const dataTable =\n",
       "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                    [key], {});\n",
       "        if (!dataTable) return;\n",
       "\n",
       "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "          + ' to learn more about interactive tables.';\n",
       "        element.innerHTML = '';\n",
       "        dataTable['output_type'] = 'display_data';\n",
       "        await google.colab.output.renderOutput(dataTable, element);\n",
       "        const docLink = document.createElement('div');\n",
       "        docLink.innerHTML = docLinkHtml;\n",
       "        element.appendChild(docLink);\n",
       "      }\n",
       "    </script>\n",
       "  </div>\n",
       "\n",
       "\n",
       "    <div id=\"df-d575d7a3-8595-414c-92a4-cd1f2c602d61\">\n",
       "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-d575d7a3-8595-414c-92a4-cd1f2c602d61')\"\n",
       "                title=\"Suggest charts\"\n",
       "                style=\"display:none;\">\n",
       "\n",
       "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "     width=\"24px\">\n",
       "    <g>\n",
       "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
       "    </g>\n",
       "</svg>\n",
       "      </button>\n",
       "\n",
       "<style>\n",
       "  .colab-df-quickchart {\n",
       "      --bg-color: #E8F0FE;\n",
       "      --fill-color: #1967D2;\n",
       "      --hover-bg-color: #E2EBFA;\n",
       "      --hover-fill-color: #174EA6;\n",
       "      --disabled-fill-color: #AAA;\n",
       "      --disabled-bg-color: #DDD;\n",
       "  }\n",
       "\n",
       "  [theme=dark] .colab-df-quickchart {\n",
       "      --bg-color: #3B4455;\n",
       "      --fill-color: #D2E3FC;\n",
       "      --hover-bg-color: #434B5C;\n",
       "      --hover-fill-color: #FFFFFF;\n",
       "      --disabled-bg-color: #3B4455;\n",
       "      --disabled-fill-color: #666;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart {\n",
       "    background-color: var(--bg-color);\n",
       "    border: none;\n",
       "    border-radius: 50%;\n",
       "    cursor: pointer;\n",
       "    display: none;\n",
       "    fill: var(--fill-color);\n",
       "    height: 32px;\n",
       "    padding: 0;\n",
       "    width: 32px;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart:hover {\n",
       "    background-color: var(--hover-bg-color);\n",
       "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "    fill: var(--button-hover-fill-color);\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart-complete:disabled,\n",
       "  .colab-df-quickchart-complete:disabled:hover {\n",
       "    background-color: var(--disabled-bg-color);\n",
       "    fill: var(--disabled-fill-color);\n",
       "    box-shadow: none;\n",
       "  }\n",
       "\n",
       "  .colab-df-spinner {\n",
       "    border: 2px solid var(--fill-color);\n",
       "    border-color: transparent;\n",
       "    border-bottom-color: var(--fill-color);\n",
       "    animation:\n",
       "      spin 1s steps(1) infinite;\n",
       "  }\n",
       "\n",
       "  @keyframes spin {\n",
       "    0% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "      border-left-color: var(--fill-color);\n",
       "    }\n",
       "    20% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    30% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    40% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    60% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    80% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "    90% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "  }\n",
       "</style>\n",
       "\n",
       "      <script>\n",
       "        async function quickchart(key) {\n",
       "          const quickchartButtonEl =\n",
       "            document.querySelector('#' + key + ' button');\n",
       "          quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
       "          quickchartButtonEl.classList.add('colab-df-spinner');\n",
       "          try {\n",
       "            const charts = await google.colab.kernel.invokeFunction(\n",
       "                'suggestCharts', [key], {});\n",
       "          } catch (error) {\n",
       "            console.error('Error during call to suggestCharts:', error);\n",
       "          }\n",
       "          quickchartButtonEl.classList.remove('colab-df-spinner');\n",
       "          quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
       "        }\n",
       "        (() => {\n",
       "          let quickchartButtonEl =\n",
       "            document.querySelector('#df-d575d7a3-8595-414c-92a4-cd1f2c602d61 button');\n",
       "          quickchartButtonEl.style.display =\n",
       "            google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "        })();\n",
       "      </script>\n",
       "    </div>\n",
       "\n",
       "    </div>\n",
       "  </div>\n"
      ],
      "text/plain": [
       "                                              vector  \\\n",
       "0  [-0.012233193, -0.0030392595, 0.022818677, -0....   \n",
       "1  [-0.00843631, -0.013696598, 0.0075827544, -0.0...   \n",
       "2  [0.005474513, -0.0017037347, 0.009329896, -0.0...   \n",
       "3  [-0.010289601, -0.0014841412, -0.0050512585, -...   \n",
       "4  [-0.03584866, -0.00017609358, 0.00922457, -0.0...   \n",
       "\n",
       "                                                text  \\\n",
       "0  Sociological and technical difficulties, such ...   \n",
       "1  Iron, the most ubiquitous of the transition me...   \n",
       "2  Mobility prediction is one of the most essenti...   \n",
       "3  It's not surprisingly when entering this site ...   \n",
       "4  We introduce a long short-term memory recurren...   \n",
       "\n",
       "                                               title  num_words     _score  \n",
       "0  Hipikat: a project memory for software develop...        210  48.126713  \n",
       "1  Synthesis, properties, and applications of iro...        158  47.561138  \n",
       "2  A data mining approach for location prediction...        135  43.397488  \n",
       "3  Genetic Fuzzy Systems - Evolutionary Tuning an...        100  36.493389  \n",
       "4  Speech-driven 3 D Facial Animation with Implic...        171  27.971348  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "table.search(\n",
    "    \"To confuse the AI and DNN embedding, let's put random terms from other sentences- automation training test memory?\",\n",
    "    query_type=\"fts\",\n",
    ").limit(5).to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 363
    },
    "id": "1wo7nfR0yhk7",
    "outputId": "9b211f6b-2ecc-4de4-db59-3531284dc9a7"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "summary": "{\n  \"name\": \")\",\n  \"rows\": 10,\n  \"fields\": [\n    {\n      \"column\": \"vector\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 10,\n        \"samples\": [\n          \"With the development of online advertisements, clickbait spread wider and wider. Clickbait dissatisfies users because the article content does not match their expectation. Thus, clickbait detection has attracted more and more attention recently. Traditional clickbait-detection methods rely on heavy feature engineering and fail to distinguish clickbait from normal headlines precisely because of the limited information in headlines. A convolutional neural network is useful for clickbait detection, since it utilizes pretrained Word2Vec to understand the headlines semantically, and employs different kernels to find various characteristics of the headlines. However, different types of articles tend to use different ways to draw users\\u2019 attention, and a pretrained Word2Vec model cannot distinguish these different ways. To address this issue, we propose a clickbait convolutional neural network (CBCNN) to consider not only the overall characteristics but also specific characteristics from different article types. Our experimental results show that our method outperforms traditional clickbait-detection algorithms and the TextCNN model in terms of precision, recall and accuracy.\",\n          \"a r t i c l e i n f o a b s t r a c t We present an automatic approach to the construction of BabelNet, a very large, wide-coverage multilingual semantic network. Key to our approach is the integration of lexicographic and encyclopedic knowledge from WordNet and Wikipedia. In addition, Machine Translation is applied to enrich the resource with lexical information for all languages. We first conduct in vitro experiments on new and existing gold-standard datasets to show the high quality and coverage of BabelNet. We then show that our lexical resource can be used successfully to perform both monolingual and cross-lingual Word Sense Disambiguation: thanks to its wide lexical coverage and novel semantic relations, we are able to achieve state-of the-art results on three different SemEval evaluation tasks.\",\n          \"This paper proposes quiz-style information presentation for interactive systems as a means to improve user understanding in educational tasks. Since the nature of quizzes can highly motivate users to stay voluntarily engaged in the interaction and keep their attention on receiving information, it is expected that information presented as quizzes can be better understood by users. To verify the effectiveness of the approach, we implemented read-out and quiz systems and performed comparison experiments using human subjects. In the task of memorizing biographical facts, the results showed that user understanding for the quiz system was significantly better than that for the read-out system, and that the subjects were more willing to use the quiz system despite the long duration of the quizzes. This indicates that quiz-style information presentation promotes engagement in the interaction with the system, leading to the improved user understanding.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 10,\n        \"samples\": [\n          \"Clickbait Convolutional Neural Network\",\n          \"BabelNet: The automatic construction, evaluation and application of a wide-coverage multilingual semantic network\",\n          \"Effects of quiz-style information presentation on user understanding\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"num_words\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 23,\n        \"min\": 112,\n        \"max\": 187,\n        \"num_unique_values\": 10,\n        \"samples\": [\n          160,\n          133,\n          141\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"_distance\",\n      \"properties\": {\n        \"dtype\": \"float32\",\n        \"num_unique_values\": 10,\n        \"samples\": [\n          0.4483324885368347,\n          0.40246960520744324,\n          0.44294866919517517\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
       "type": "dataframe"
      },
      "text/html": [
       "\n",
       "  <div id=\"df-4d6e13d2-1ebd-4569-8ef6-4724b3f0af36\" class=\"colab-df-container\">\n",
       "    <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>vector</th>\n",
       "      <th>text</th>\n",
       "      <th>title</th>\n",
       "      <th>num_words</th>\n",
       "      <th>_distance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[-0.02901112, -0.0019353012, 0.010076279, -0.0...</td>\n",
       "      <td>Classifying short texts to one category or clu...</td>\n",
       "      <td>Using deep learning for short text understanding</td>\n",
       "      <td>187</td>\n",
       "      <td>0.354657</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[-0.01352904, -0.0019552337, 0.0095874, -0.030...</td>\n",
       "      <td>a r t i c l e i n f o a b s t r a c t We prese...</td>\n",
       "      <td>BabelNet: The automatic construction, evaluati...</td>\n",
       "      <td>133</td>\n",
       "      <td>0.402470</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[-0.027341157, -0.0001916921, 0.017680977, -0....</td>\n",
       "      <td>We introduce a technique for augmenting neural...</td>\n",
       "      <td>Deep Voice 2 : Multi-Speaker Neural Text-to-Sp...</td>\n",
       "      <td>151</td>\n",
       "      <td>0.417746</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[-0.03584866, -0.00017609358, 0.00922457, -0.0...</td>\n",
       "      <td>We introduce a long short-term memory recurren...</td>\n",
       "      <td>Speech-driven 3 D Facial Animation with Implic...</td>\n",
       "      <td>171</td>\n",
       "      <td>0.421450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[-0.010492763, -0.006538273, 0.01853196, -0.03...</td>\n",
       "      <td>The presented work aims at generating a system...</td>\n",
       "      <td>BCSAT : A Benchmark Corpus for Sentiment Analy...</td>\n",
       "      <td>126</td>\n",
       "      <td>0.429735</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>[-0.03796007, 0.010790204, 0.029059827, -0.034...</td>\n",
       "      <td>This paper proposes quiz-style information pre...</td>\n",
       "      <td>Effects of quiz-style information presentation...</td>\n",
       "      <td>141</td>\n",
       "      <td>0.442949</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>[-0.03084091, -0.002591715, 0.010260329, -0.01...</td>\n",
       "      <td>This report summarizes the objectives and eval...</td>\n",
       "      <td>SemEval-2015 Task 11: Sentiment Analysis of Fi...</td>\n",
       "      <td>112</td>\n",
       "      <td>0.444655</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>[-0.00980282, 0.007281899, 0.009843921, -0.031...</td>\n",
       "      <td>This paper presents a simple unsupervised lear...</td>\n",
       "      <td>Thumbs Up or Thumbs Down? Semantic Orientation...</td>\n",
       "      <td>163</td>\n",
       "      <td>0.448142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>[-0.027524939, 0.017282786, 0.016536668, -0.01...</td>\n",
       "      <td>With the development of online advertisements,...</td>\n",
       "      <td>Clickbait Convolutional Neural Network</td>\n",
       "      <td>160</td>\n",
       "      <td>0.448332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>[-0.016280893, 0.010584505, 0.030072825, -0.01...</td>\n",
       "      <td>Theories on the functions of the hippocampal s...</td>\n",
       "      <td>Memory, navigation and theta rhythm in the hip...</td>\n",
       "      <td>130</td>\n",
       "      <td>0.453260</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "    <div class=\"colab-df-buttons\">\n",
       "\n",
       "  <div class=\"colab-df-container\">\n",
       "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4d6e13d2-1ebd-4569-8ef6-4724b3f0af36')\"\n",
       "            title=\"Convert this dataframe to an interactive table.\"\n",
       "            style=\"display:none;\">\n",
       "\n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
       "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
       "  </svg>\n",
       "    </button>\n",
       "\n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    .colab-df-buttons div {\n",
       "      margin-bottom: 4px;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "    <script>\n",
       "      const buttonEl =\n",
       "        document.querySelector('#df-4d6e13d2-1ebd-4569-8ef6-4724b3f0af36 button.colab-df-convert');\n",
       "      buttonEl.style.display =\n",
       "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "      async function convertToInteractive(key) {\n",
       "        const element = document.querySelector('#df-4d6e13d2-1ebd-4569-8ef6-4724b3f0af36');\n",
       "        const dataTable =\n",
       "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                    [key], {});\n",
       "        if (!dataTable) return;\n",
       "\n",
       "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "          + ' to learn more about interactive tables.';\n",
       "        element.innerHTML = '';\n",
       "        dataTable['output_type'] = 'display_data';\n",
       "        await google.colab.output.renderOutput(dataTable, element);\n",
       "        const docLink = document.createElement('div');\n",
       "        docLink.innerHTML = docLinkHtml;\n",
       "        element.appendChild(docLink);\n",
       "      }\n",
       "    </script>\n",
       "  </div>\n",
       "\n",
       "\n",
       "    <div id=\"df-2555b3d6-f543-42ba-8c55-4f0e9fc28922\">\n",
       "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-2555b3d6-f543-42ba-8c55-4f0e9fc28922')\"\n",
       "                title=\"Suggest charts\"\n",
       "                style=\"display:none;\">\n",
       "\n",
       "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "     width=\"24px\">\n",
       "    <g>\n",
       "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
       "    </g>\n",
       "</svg>\n",
       "      </button>\n",
       "\n",
       "<style>\n",
       "  .colab-df-quickchart {\n",
       "      --bg-color: #E8F0FE;\n",
       "      --fill-color: #1967D2;\n",
       "      --hover-bg-color: #E2EBFA;\n",
       "      --hover-fill-color: #174EA6;\n",
       "      --disabled-fill-color: #AAA;\n",
       "      --disabled-bg-color: #DDD;\n",
       "  }\n",
       "\n",
       "  [theme=dark] .colab-df-quickchart {\n",
       "      --bg-color: #3B4455;\n",
       "      --fill-color: #D2E3FC;\n",
       "      --hover-bg-color: #434B5C;\n",
       "      --hover-fill-color: #FFFFFF;\n",
       "      --disabled-bg-color: #3B4455;\n",
       "      --disabled-fill-color: #666;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart {\n",
       "    background-color: var(--bg-color);\n",
       "    border: none;\n",
       "    border-radius: 50%;\n",
       "    cursor: pointer;\n",
       "    display: none;\n",
       "    fill: var(--fill-color);\n",
       "    height: 32px;\n",
       "    padding: 0;\n",
       "    width: 32px;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart:hover {\n",
       "    background-color: var(--hover-bg-color);\n",
       "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "    fill: var(--button-hover-fill-color);\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart-complete:disabled,\n",
       "  .colab-df-quickchart-complete:disabled:hover {\n",
       "    background-color: var(--disabled-bg-color);\n",
       "    fill: var(--disabled-fill-color);\n",
       "    box-shadow: none;\n",
       "  }\n",
       "\n",
       "  .colab-df-spinner {\n",
       "    border: 2px solid var(--fill-color);\n",
       "    border-color: transparent;\n",
       "    border-bottom-color: var(--fill-color);\n",
       "    animation:\n",
       "      spin 1s steps(1) infinite;\n",
       "  }\n",
       "\n",
       "  @keyframes spin {\n",
       "    0% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "      border-left-color: var(--fill-color);\n",
       "    }\n",
       "    20% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    30% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    40% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    60% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    80% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "    90% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "  }\n",
       "</style>\n",
       "\n",
       "      <script>\n",
       "        async function quickchart(key) {\n",
       "          const quickchartButtonEl =\n",
       "            document.querySelector('#' + key + ' button');\n",
       "          quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
       "          quickchartButtonEl.classList.add('colab-df-spinner');\n",
       "          try {\n",
       "            const charts = await google.colab.kernel.invokeFunction(\n",
       "                'suggestCharts', [key], {});\n",
       "          } catch (error) {\n",
       "            console.error('Error during call to suggestCharts:', error);\n",
       "          }\n",
       "          quickchartButtonEl.classList.remove('colab-df-spinner');\n",
       "          quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
       "        }\n",
       "        (() => {\n",
       "          let quickchartButtonEl =\n",
       "            document.querySelector('#df-2555b3d6-f543-42ba-8c55-4f0e9fc28922 button');\n",
       "          quickchartButtonEl.style.display =\n",
       "            google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "        })();\n",
       "      </script>\n",
       "    </div>\n",
       "\n",
       "    </div>\n",
       "  </div>\n"
      ],
      "text/plain": [
       "                                              vector  \\\n",
       "0  [-0.02901112, -0.0019353012, 0.010076279, -0.0...   \n",
       "1  [-0.01352904, -0.0019552337, 0.0095874, -0.030...   \n",
       "2  [-0.027341157, -0.0001916921, 0.017680977, -0....   \n",
       "3  [-0.03584866, -0.00017609358, 0.00922457, -0.0...   \n",
       "4  [-0.010492763, -0.006538273, 0.01853196, -0.03...   \n",
       "5  [-0.03796007, 0.010790204, 0.029059827, -0.034...   \n",
       "6  [-0.03084091, -0.002591715, 0.010260329, -0.01...   \n",
       "7  [-0.00980282, 0.007281899, 0.009843921, -0.031...   \n",
       "8  [-0.027524939, 0.017282786, 0.016536668, -0.01...   \n",
       "9  [-0.016280893, 0.010584505, 0.030072825, -0.01...   \n",
       "\n",
       "                                                text  \\\n",
       "0  Classifying short texts to one category or clu...   \n",
       "1  a r t i c l e i n f o a b s t r a c t We prese...   \n",
       "2  We introduce a technique for augmenting neural...   \n",
       "3  We introduce a long short-term memory recurren...   \n",
       "4  The presented work aims at generating a system...   \n",
       "5  This paper proposes quiz-style information pre...   \n",
       "6  This report summarizes the objectives and eval...   \n",
       "7  This paper presents a simple unsupervised lear...   \n",
       "8  With the development of online advertisements,...   \n",
       "9  Theories on the functions of the hippocampal s...   \n",
       "\n",
       "                                               title  num_words  _distance  \n",
       "0   Using deep learning for short text understanding        187   0.354657  \n",
       "1  BabelNet: The automatic construction, evaluati...        133   0.402470  \n",
       "2  Deep Voice 2 : Multi-Speaker Neural Text-to-Sp...        151   0.417746  \n",
       "3  Speech-driven 3 D Facial Animation with Implic...        171   0.421450  \n",
       "4  BCSAT : A Benchmark Corpus for Sentiment Analy...        126   0.429735  \n",
       "5  Effects of quiz-style information presentation...        141   0.442949  \n",
       "6  SemEval-2015 Task 11: Sentiment Analysis of Fi...        112   0.444655  \n",
       "7  Thumbs Up or Thumbs Down? Semantic Orientation...        163   0.448142  \n",
       "8             Clickbait Convolutional Neural Network        160   0.448332  \n",
       "9  Memory, navigation and theta rhythm in the hip...        130   0.453260  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "table.search(\n",
    "    \"To confuse the AI and DNN embedding, let's put random terms from other sentences- automation training test memory?\",\n",
    "    query_type=\"vector\",\n",
    ").limit(10).to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "e_AH1IxlFPn-"
   },
   "source": [
    "## Perform inbuilt Hybrid Search\n",
    "They have some off the shelf functionalities and a way to implement the custom Re-Ranking and Filtering Function here [Implement Custom Rerankers](https://lancedb.github.io/lancedb/hybrid_search/#building-custom-rerankers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 206
    },
    "id": "Se27vipkFSzf",
    "outputId": "f1e502ed-9fea-4f00-d8e3-298a93dc5380"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "summary": "{\n  \"name\": \")\",\n  \"rows\": 5,\n  \"fields\": [\n    {\n      \"column\": \"vector\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"It's not surprisingly when entering this site to get the book. One of the popular books now is the genetic fuzzy systems evolutionary tuning and learning of fuzzy knowledge bases. You may be confused because you can't find the book in the book store around your city. Commonly, the popular book will be sold quickly. And when you have found the store to buy the book, it will be so hurt when you run out of it. This is why, searching for this popular book in this website will give you benefit. You will not run out of this book.\",\n          \"The presented work aims at generating a systematically annotated corpus that can support the enhancement of sentiment analysis tasks in Telugu using wordlevel sentiment annotations. From OntoSenseNet, we extracted 11,000 adjectives, 253 adverbs, 8483 verbs and sentiment annotation is being done by language experts. We discuss the methodology followed for the polarity annotations and validate the developed resource. This work aims at developing a benchmark corpus, as an extension to SentiWordNet, and baseline accuracy for a model where lexeme annotations are applied for sentiment predictions. The fundamental aim of this paper is to validate and study the possibility of utilizing machine learning algorithms, word-level sentiment annotations in the task of automated sentiment identification. Furthermore, accuracy is improved by annotating the bi-grams extracted from the target corpus.\",\n          \"Mobility prediction is one of the most essential issues that need to be explored for mobility management in mobile computing systems. In this paper, we propose a new algorithm for predicting the next inter-cell movement of a mobile user in a Personal Communication Systems network. In the first phase of our threephase algorithm, user mobility patterns are mined from the history of mobile user trajectories. In the second phase, mobility rules are extracted from these patterns, and in the last phase, mobility predictions are accomplished by using these rules. The performance of the proposed algorithm is evaluated through simulation as compared to two other prediction methods. The performance results obtained in terms of Precision and Recall indicate that our method can make more accurate predictions than the other methods. 2004 Elsevier B.V. All rights reserved.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"Genetic Fuzzy Systems - Evolutionary Tuning and Learning of Fuzzy Knowledge Bases\",\n          \"BCSAT : A Benchmark Corpus for Sentiment Analysis in Telugu Using Word-level Annotations\",\n          \"A data mining approach for location prediction in mobile environments\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"num_words\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 27,\n        \"min\": 100,\n        \"max\": 171,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          100,\n          126,\n          135\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"_relevance_score\",\n      \"properties\": {\n        \"dtype\": \"float32\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          0.8731547594070435,\n          0.699999988079071,\n          0.770391583442688\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
       "type": "dataframe"
      },
      "text/html": [
       "\n",
       "  <div id=\"df-e24a8062-9fb7-4878-b499-6799cb712375\" class=\"colab-df-container\">\n",
       "    <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>vector</th>\n",
       "      <th>text</th>\n",
       "      <th>title</th>\n",
       "      <th>num_words</th>\n",
       "      <th>_relevance_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[-0.03584866, -0.00017609358, 0.00922457, -0.0...</td>\n",
       "      <td>We introduce a long short-term memory recurren...</td>\n",
       "      <td>Speech-driven 3 D Facial Animation with Implic...</td>\n",
       "      <td>171</td>\n",
       "      <td>0.921843</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[-0.010289601, -0.0014841412, -0.0050512585, -...</td>\n",
       "      <td>It's not surprisingly when entering this site ...</td>\n",
       "      <td>Genetic Fuzzy Systems - Evolutionary Tuning an...</td>\n",
       "      <td>100</td>\n",
       "      <td>0.873155</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[0.005474513, -0.0017037347, 0.009329896, -0.0...</td>\n",
       "      <td>Mobility prediction is one of the most essenti...</td>\n",
       "      <td>A data mining approach for location prediction...</td>\n",
       "      <td>135</td>\n",
       "      <td>0.770392</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[-0.00843631, -0.013696598, 0.0075827544, -0.0...</td>\n",
       "      <td>Iron, the most ubiquitous of the transition me...</td>\n",
       "      <td>Synthesis, properties, and applications of iro...</td>\n",
       "      <td>158</td>\n",
       "      <td>0.708418</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[-0.010492763, -0.006538273, 0.01853196, -0.03...</td>\n",
       "      <td>The presented work aims at generating a system...</td>\n",
       "      <td>BCSAT : A Benchmark Corpus for Sentiment Analy...</td>\n",
       "      <td>126</td>\n",
       "      <td>0.700000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "    <div class=\"colab-df-buttons\">\n",
       "\n",
       "  <div class=\"colab-df-container\">\n",
       "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e24a8062-9fb7-4878-b499-6799cb712375')\"\n",
       "            title=\"Convert this dataframe to an interactive table.\"\n",
       "            style=\"display:none;\">\n",
       "\n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
       "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
       "  </svg>\n",
       "    </button>\n",
       "\n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    .colab-df-buttons div {\n",
       "      margin-bottom: 4px;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "    <script>\n",
       "      const buttonEl =\n",
       "        document.querySelector('#df-e24a8062-9fb7-4878-b499-6799cb712375 button.colab-df-convert');\n",
       "      buttonEl.style.display =\n",
       "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "      async function convertToInteractive(key) {\n",
       "        const element = document.querySelector('#df-e24a8062-9fb7-4878-b499-6799cb712375');\n",
       "        const dataTable =\n",
       "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                    [key], {});\n",
       "        if (!dataTable) return;\n",
       "\n",
       "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "          + ' to learn more about interactive tables.';\n",
       "        element.innerHTML = '';\n",
       "        dataTable['output_type'] = 'display_data';\n",
       "        await google.colab.output.renderOutput(dataTable, element);\n",
       "        const docLink = document.createElement('div');\n",
       "        docLink.innerHTML = docLinkHtml;\n",
       "        element.appendChild(docLink);\n",
       "      }\n",
       "    </script>\n",
       "  </div>\n",
       "\n",
       "\n",
       "    <div id=\"df-5de9efef-969e-436f-9ce4-c0eac017f8cc\">\n",
       "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-5de9efef-969e-436f-9ce4-c0eac017f8cc')\"\n",
       "                title=\"Suggest charts\"\n",
       "                style=\"display:none;\">\n",
       "\n",
       "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "     width=\"24px\">\n",
       "    <g>\n",
       "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
       "    </g>\n",
       "</svg>\n",
       "      </button>\n",
       "\n",
       "<style>\n",
       "  .colab-df-quickchart {\n",
       "      --bg-color: #E8F0FE;\n",
       "      --fill-color: #1967D2;\n",
       "      --hover-bg-color: #E2EBFA;\n",
       "      --hover-fill-color: #174EA6;\n",
       "      --disabled-fill-color: #AAA;\n",
       "      --disabled-bg-color: #DDD;\n",
       "  }\n",
       "\n",
       "  [theme=dark] .colab-df-quickchart {\n",
       "      --bg-color: #3B4455;\n",
       "      --fill-color: #D2E3FC;\n",
       "      --hover-bg-color: #434B5C;\n",
       "      --hover-fill-color: #FFFFFF;\n",
       "      --disabled-bg-color: #3B4455;\n",
       "      --disabled-fill-color: #666;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart {\n",
       "    background-color: var(--bg-color);\n",
       "    border: none;\n",
       "    border-radius: 50%;\n",
       "    cursor: pointer;\n",
       "    display: none;\n",
       "    fill: var(--fill-color);\n",
       "    height: 32px;\n",
       "    padding: 0;\n",
       "    width: 32px;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart:hover {\n",
       "    background-color: var(--hover-bg-color);\n",
       "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "    fill: var(--button-hover-fill-color);\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart-complete:disabled,\n",
       "  .colab-df-quickchart-complete:disabled:hover {\n",
       "    background-color: var(--disabled-bg-color);\n",
       "    fill: var(--disabled-fill-color);\n",
       "    box-shadow: none;\n",
       "  }\n",
       "\n",
       "  .colab-df-spinner {\n",
       "    border: 2px solid var(--fill-color);\n",
       "    border-color: transparent;\n",
       "    border-bottom-color: var(--fill-color);\n",
       "    animation:\n",
       "      spin 1s steps(1) infinite;\n",
       "  }\n",
       "\n",
       "  @keyframes spin {\n",
       "    0% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "      border-left-color: var(--fill-color);\n",
       "    }\n",
       "    20% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    30% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    40% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    60% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    80% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "    90% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "  }\n",
       "</style>\n",
       "\n",
       "      <script>\n",
       "        async function quickchart(key) {\n",
       "          const quickchartButtonEl =\n",
       "            document.querySelector('#' + key + ' button');\n",
       "          quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
       "          quickchartButtonEl.classList.add('colab-df-spinner');\n",
       "          try {\n",
       "            const charts = await google.colab.kernel.invokeFunction(\n",
       "                'suggestCharts', [key], {});\n",
       "          } catch (error) {\n",
       "            console.error('Error during call to suggestCharts:', error);\n",
       "          }\n",
       "          quickchartButtonEl.classList.remove('colab-df-spinner');\n",
       "          quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
       "        }\n",
       "        (() => {\n",
       "          let quickchartButtonEl =\n",
       "            document.querySelector('#df-5de9efef-969e-436f-9ce4-c0eac017f8cc button');\n",
       "          quickchartButtonEl.style.display =\n",
       "            google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "        })();\n",
       "      </script>\n",
       "    </div>\n",
       "\n",
       "    </div>\n",
       "  </div>\n"
      ],
      "text/plain": [
       "                                              vector  \\\n",
       "0  [-0.03584866, -0.00017609358, 0.00922457, -0.0...   \n",
       "1  [-0.010289601, -0.0014841412, -0.0050512585, -...   \n",
       "2  [0.005474513, -0.0017037347, 0.009329896, -0.0...   \n",
       "3  [-0.00843631, -0.013696598, 0.0075827544, -0.0...   \n",
       "4  [-0.010492763, -0.006538273, 0.01853196, -0.03...   \n",
       "\n",
       "                                                text  \\\n",
       "0  We introduce a long short-term memory recurren...   \n",
       "1  It's not surprisingly when entering this site ...   \n",
       "2  Mobility prediction is one of the most essenti...   \n",
       "3  Iron, the most ubiquitous of the transition me...   \n",
       "4  The presented work aims at generating a system...   \n",
       "\n",
       "                                               title  num_words  \\\n",
       "0  Speech-driven 3 D Facial Animation with Implic...        171   \n",
       "1  Genetic Fuzzy Systems - Evolutionary Tuning an...        100   \n",
       "2  A data mining approach for location prediction...        135   \n",
       "3  Synthesis, properties, and applications of iro...        158   \n",
       "4  BCSAT : A Benchmark Corpus for Sentiment Analy...        126   \n",
       "\n",
       "   _relevance_score  \n",
       "0          0.921843  \n",
       "1          0.873155  \n",
       "2          0.770392  \n",
       "3          0.708418  \n",
       "4          0.700000  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from lancedb.rerankers import LinearCombinationReranker\n",
    "\n",
    "reranker = LinearCombinationReranker(\n",
    "    weight=0.7\n",
    ")  # Weight = 0 Means pure Text Search (BM-25) and 1 means pure Sementic (Vector) Search\n",
    "\n",
    "table.search(\n",
    "    \"To confuse the AI and DNN embedding, let's put random terms from other sentences- automation training test memory?\",\n",
    "    query_type=\"hybrid\",\n",
    ").rerank(reranker=reranker).limit(5).to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Yw5OecIjE3IB"
   },
   "source": [
    "## Build custom Filtering Function\n",
    "\n",
    "By passing the `pandas.query` style, filtering, we will do the following 2 things:\n",
    "\n",
    "1. Remove all the rows which contain a specific term, in out case, `\"dual-band\"`\n",
    "2. Keep only the rows which have `num_words > 100`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 269
    },
    "id": "vfMIXT2vE9yv",
    "outputId": "2bf4158b-29ab-4358-f478-cde304d7a4a4"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "summary": "{\n  \"name\": \")\",\n  \"rows\": 7,\n  \"fields\": [\n    {\n      \"column\": \"vector\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"In this paper a novel approach is proposed to predict intraday directional-movements of a currency-pair in the foreign exchange market based on the text of breaking financial news-headlines. The motivation behind this work is twofold: First, although market-prediction through text-mining is shown to be a promising area of work in the literature, the text-mining approaches utilized in it at this stage are not much beyond basic ones as it is still an emerging field. This work is an effort to put more emphasis on the text-mining methods and tackle some specific aspects thereof that are weak in previous works, namely: the problem of high dimensionality as well as the problem of ignoring sentiment and semantics in dealing with textual language. This research assumes that addressing these aspects of text-mining have an impact on the quality of the achieved results. The proposed system proves this assumption to be right. The second part of the motivation is to research a specific market, namely, the foreign exchange market, which seems not to have been researched in the previous works based on predictive text-mining. Therefore, results of this work also successfully demonstrate a predictive relationship between this specific market-type and the textual data of news. Besides the above two main components of the motivation, there are other specific aspects that make the setup of the proposed system and the conducted experiment unique, for example, the use of news article-headlines only and not news article-bodies, which enables usage of short pieces of text rather than long ones; or the use of general financial breaking news without any further filtration. In order to accomplish the above, this work produces a multi-layer algorithm that tackles each of the mentioned aspects of the text-mining problem at a designated layer. The first layer is termed the Semantic Abstraction Layer and addresses the problem of co-reference in text mining that is contributing to sparsity. Co-reference occurs when two or more words in a text corpus refer to the same concept. This work produces a custom approach by the name of Heuristic-Hypernyms Feature-Selection which creates a way to recognize words with the same parent-word to be regarded as one entity. As a result, prediction accuracy increases significantly at this layer which is attributed to appropriate noise-reduction from the feature-space. The second layer is termed Sentiment Integration Layer, which integrates sentiment analysis capability into the algorithm by proposing a sentiment weight by the name of SumScore that reflects investors\\u2019 sentiment. Additionally, this layer reduces the dimensions by eliminating those that are of zero value in terms of sentiment and thereby improves prediction accuracy. The third layer encompasses a dynamic model creation algorithm, termed Synchronous Targeted Feature Reduction (STFR). It is suitable for the challenge at hand whereby the mining of a stream of text is concerned. It updates the models with the most recent information available and, more importantly, it ensures that the dimensions are reduced to the absolute minimum. The algorithm and each of its layers are extensively evaluated using real market data and news content across multiple years and have proven to be solid and superior to any other comparable solution. The proposed techniques implemented in the system, result in significantly high directional-accuracies of up to 83.33%. On top of a well-rounded multifaceted algorithm, this work contributes a much needed research framework for this context with a test-bed of data that must make future research endeavors more convenient. The produced algorithm is scalable and its modular design allows improvement in each of its layers in future research. This paper provides ample details to reproduce the entire system and the conducted experiments. 2014 Elsevier Ltd. All rights reserved. A. Khadjeh Nassirtoussi et al. / Expert Systems with Applications 42 (2015) 306\\u2013324 307\",\n          \"This paper introduces a new compositional framework for classifying color correction methods according to their two main computational units. The framework was used to dissect fifteen among the best color correction algorithms and the computational units so derived, with the addition of four new units specifically designed for this work, were then reassembled in a combinatorial way to originate about one hundred distinct color correction methods, most of which never considered before. The above color correction methods were tested on three different existing datasets, including both real and artificial color transformations, plus a novel dataset of real image pairs categorized according to the kind of color alterations induced by specific acquisition setups. Differently from previous evaluations, special emphasis was given to effectiveness in real world applications, such as image mosaicing and stitching, where robustness with respect to strong image misalignments and light scattering effects is required. Experimental evidence is provided for the first time in terms of the most recent perceptual image quality metrics, which are known to be the closest to human judgment. Comparative results show that combinations of the new computational units are the most effective for real stitching scenarios, regardless of the specific source of color alteration. On the other hand, in the case of accurate image alignment and artificial color alterations, the best performing methods either use one of the new computational units, or are made up of fresh combinations of existing units.\",\n          \"We introduce a technique for augmenting neural text-to-speech (TTS) with lowdimensional trainable speaker embeddings to generate different voices from a single model. As a starting point, we show improvements over the two state-ofthe-art approaches for single-speaker neural TTS: Deep Voice 1 and Tacotron. We introduce Deep Voice 2, which is based on a similar pipeline with Deep Voice 1, but constructed with higher performance building blocks and demonstrates a significant audio quality improvement over Deep Voice 1. We improve Tacotron by introducing a post-processing neural vocoder, and demonstrate a significant audio quality improvement. We then demonstrate our technique for multi-speaker speech synthesis for both Deep Voice 2 and Tacotron on two multi-speaker TTS datasets. We show that a single neural TTS system can learn hundreds of unique voices from less than half an hour of data per speaker, while achieving high audio quality synthesis and preserving the speaker identities almost perfectly.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"Text mining of news-headlines for FOREX market prediction: A Multi-layer Dimension Reduction Algorithm with semantics and sentiment\",\n          \"Dissecting and Reassembling Color Correction Algorithms for Image Stitching\",\n          \"Deep Voice 2 : Multi-Speaker Neural Text-to-Speech\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"num_words\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 167,\n        \"min\": 151,\n        \"max\": 621,\n        \"num_unique_values\": 7,\n        \"samples\": [\n          621,\n          237,\n          151\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"_relevance_score\",\n      \"properties\": {\n        \"dtype\": \"float32\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          1.0,\n          0.9020528197288513,\n          0.48952168226242065\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
       "type": "dataframe"
      },
      "text/html": [
       "\n",
       "  <div id=\"df-60d1bb38-f4ee-4f37-a97f-b87117a57a0e\" class=\"colab-df-container\">\n",
       "    <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>vector</th>\n",
       "      <th>text</th>\n",
       "      <th>title</th>\n",
       "      <th>num_words</th>\n",
       "      <th>_relevance_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[-0.027824033, -0.00014784517, 0.012952094, -0...</td>\n",
       "      <td>In this paper a novel approach is proposed to ...</td>\n",
       "      <td>Text mining of news-headlines for FOREX market...</td>\n",
       "      <td>621</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[0.008277873, 0.023134368, -0.0037977214, -0.0...</td>\n",
       "      <td>This paper introduces a new compositional fram...</td>\n",
       "      <td>Dissecting and Reassembling Color Correction A...</td>\n",
       "      <td>237</td>\n",
       "      <td>0.902053</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[-0.00843631, -0.013696598, 0.0075827544, -0.0...</td>\n",
       "      <td>Iron, the most ubiquitous of the transition me...</td>\n",
       "      <td>Synthesis, properties, and applications of iro...</td>\n",
       "      <td>158</td>\n",
       "      <td>0.705130</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>[-0.03584866, -0.00017609358, 0.00922457, -0.0...</td>\n",
       "      <td>We introduce a long short-term memory recurren...</td>\n",
       "      <td>Speech-driven 3 D Facial Animation with Implic...</td>\n",
       "      <td>171</td>\n",
       "      <td>0.701210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>[-0.012233193, -0.0030392595, 0.022818677, -0....</td>\n",
       "      <td>Sociological and technical difficulties, such ...</td>\n",
       "      <td>Hipikat: a project memory for software develop...</td>\n",
       "      <td>210</td>\n",
       "      <td>0.700000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>[-0.027341157, -0.0001916921, 0.017680977, -0....</td>\n",
       "      <td>We introduce a technique for augmenting neural...</td>\n",
       "      <td>Deep Voice 2 : Multi-Speaker Neural Text-to-Sp...</td>\n",
       "      <td>151</td>\n",
       "      <td>0.489522</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>[-0.02901112, -0.0019353012, 0.010076279, -0.0...</td>\n",
       "      <td>Classifying short texts to one category or clu...</td>\n",
       "      <td>Using deep learning for short text understanding</td>\n",
       "      <td>187</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "    <div class=\"colab-df-buttons\">\n",
       "\n",
       "  <div class=\"colab-df-container\">\n",
       "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-60d1bb38-f4ee-4f37-a97f-b87117a57a0e')\"\n",
       "            title=\"Convert this dataframe to an interactive table.\"\n",
       "            style=\"display:none;\">\n",
       "\n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
       "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
       "  </svg>\n",
       "    </button>\n",
       "\n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    .colab-df-buttons div {\n",
       "      margin-bottom: 4px;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "    <script>\n",
       "      const buttonEl =\n",
       "        document.querySelector('#df-60d1bb38-f4ee-4f37-a97f-b87117a57a0e button.colab-df-convert');\n",
       "      buttonEl.style.display =\n",
       "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "      async function convertToInteractive(key) {\n",
       "        const element = document.querySelector('#df-60d1bb38-f4ee-4f37-a97f-b87117a57a0e');\n",
       "        const dataTable =\n",
       "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                    [key], {});\n",
       "        if (!dataTable) return;\n",
       "\n",
       "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "          + ' to learn more about interactive tables.';\n",
       "        element.innerHTML = '';\n",
       "        dataTable['output_type'] = 'display_data';\n",
       "        await google.colab.output.renderOutput(dataTable, element);\n",
       "        const docLink = document.createElement('div');\n",
       "        docLink.innerHTML = docLinkHtml;\n",
       "        element.appendChild(docLink);\n",
       "      }\n",
       "    </script>\n",
       "  </div>\n",
       "\n",
       "\n",
       "    <div id=\"df-61a5556b-26d8-4877-878f-0d2e37b89ef9\">\n",
       "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-61a5556b-26d8-4877-878f-0d2e37b89ef9')\"\n",
       "                title=\"Suggest charts\"\n",
       "                style=\"display:none;\">\n",
       "\n",
       "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "     width=\"24px\">\n",
       "    <g>\n",
       "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
       "    </g>\n",
       "</svg>\n",
       "      </button>\n",
       "\n",
       "<style>\n",
       "  .colab-df-quickchart {\n",
       "      --bg-color: #E8F0FE;\n",
       "      --fill-color: #1967D2;\n",
       "      --hover-bg-color: #E2EBFA;\n",
       "      --hover-fill-color: #174EA6;\n",
       "      --disabled-fill-color: #AAA;\n",
       "      --disabled-bg-color: #DDD;\n",
       "  }\n",
       "\n",
       "  [theme=dark] .colab-df-quickchart {\n",
       "      --bg-color: #3B4455;\n",
       "      --fill-color: #D2E3FC;\n",
       "      --hover-bg-color: #434B5C;\n",
       "      --hover-fill-color: #FFFFFF;\n",
       "      --disabled-bg-color: #3B4455;\n",
       "      --disabled-fill-color: #666;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart {\n",
       "    background-color: var(--bg-color);\n",
       "    border: none;\n",
       "    border-radius: 50%;\n",
       "    cursor: pointer;\n",
       "    display: none;\n",
       "    fill: var(--fill-color);\n",
       "    height: 32px;\n",
       "    padding: 0;\n",
       "    width: 32px;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart:hover {\n",
       "    background-color: var(--hover-bg-color);\n",
       "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "    fill: var(--button-hover-fill-color);\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart-complete:disabled,\n",
       "  .colab-df-quickchart-complete:disabled:hover {\n",
       "    background-color: var(--disabled-bg-color);\n",
       "    fill: var(--disabled-fill-color);\n",
       "    box-shadow: none;\n",
       "  }\n",
       "\n",
       "  .colab-df-spinner {\n",
       "    border: 2px solid var(--fill-color);\n",
       "    border-color: transparent;\n",
       "    border-bottom-color: var(--fill-color);\n",
       "    animation:\n",
       "      spin 1s steps(1) infinite;\n",
       "  }\n",
       "\n",
       "  @keyframes spin {\n",
       "    0% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "      border-left-color: var(--fill-color);\n",
       "    }\n",
       "    20% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    30% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    40% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    60% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    80% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "    90% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "  }\n",
       "</style>\n",
       "\n",
       "      <script>\n",
       "        async function quickchart(key) {\n",
       "          const quickchartButtonEl =\n",
       "            document.querySelector('#' + key + ' button');\n",
       "          quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
       "          quickchartButtonEl.classList.add('colab-df-spinner');\n",
       "          try {\n",
       "            const charts = await google.colab.kernel.invokeFunction(\n",
       "                'suggestCharts', [key], {});\n",
       "          } catch (error) {\n",
       "            console.error('Error during call to suggestCharts:', error);\n",
       "          }\n",
       "          quickchartButtonEl.classList.remove('colab-df-spinner');\n",
       "          quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
       "        }\n",
       "        (() => {\n",
       "          let quickchartButtonEl =\n",
       "            document.querySelector('#df-61a5556b-26d8-4877-878f-0d2e37b89ef9 button');\n",
       "          quickchartButtonEl.style.display =\n",
       "            google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "        })();\n",
       "      </script>\n",
       "    </div>\n",
       "\n",
       "    </div>\n",
       "  </div>\n"
      ],
      "text/plain": [
       "                                               vector  \\\n",
       "0   [-0.027824033, -0.00014784517, 0.012952094, -0...   \n",
       "1   [0.008277873, 0.023134368, -0.0037977214, -0.0...   \n",
       "4   [-0.00843631, -0.013696598, 0.0075827544, -0.0...   \n",
       "5   [-0.03584866, -0.00017609358, 0.00922457, -0.0...   \n",
       "7   [-0.012233193, -0.0030392595, 0.022818677, -0....   \n",
       "10  [-0.027341157, -0.0001916921, 0.017680977, -0....   \n",
       "12  [-0.02901112, -0.0019353012, 0.010076279, -0.0...   \n",
       "\n",
       "                                                 text  \\\n",
       "0   In this paper a novel approach is proposed to ...   \n",
       "1   This paper introduces a new compositional fram...   \n",
       "4   Iron, the most ubiquitous of the transition me...   \n",
       "5   We introduce a long short-term memory recurren...   \n",
       "7   Sociological and technical difficulties, such ...   \n",
       "10  We introduce a technique for augmenting neural...   \n",
       "12  Classifying short texts to one category or clu...   \n",
       "\n",
       "                                                title  num_words  \\\n",
       "0   Text mining of news-headlines for FOREX market...        621   \n",
       "1   Dissecting and Reassembling Color Correction A...        237   \n",
       "4   Synthesis, properties, and applications of iro...        158   \n",
       "5   Speech-driven 3 D Facial Animation with Implic...        171   \n",
       "7   Hipikat: a project memory for software develop...        210   \n",
       "10  Deep Voice 2 : Multi-Speaker Neural Text-to-Sp...        151   \n",
       "12   Using deep learning for short text understanding        187   \n",
       "\n",
       "    _relevance_score  \n",
       "0           1.000000  \n",
       "1           0.902053  \n",
       "4           0.705130  \n",
       "5           0.701210  \n",
       "7           0.700000  \n",
       "10          0.489522  \n",
       "12          0.000000  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from typing import List, Union\n",
    "import pandas as pd\n",
    "import pyarrow as pa\n",
    "\n",
    "\n",
    "class MofidifiedLinearReranker(LinearCombinationReranker):\n",
    "    def __init__(self, filters: Union[str, List[str]], **kwargs):\n",
    "        super().__init__(**kwargs)\n",
    "        filters = filters if isinstance(filters, list) else [filters]\n",
    "        self.filters = filters\n",
    "\n",
    "    def rerank_hybrid(\n",
    "        self, query: str, vector_results: pa.Table, fts_results: pa.Table\n",
    "    ) -> pa.Table:\n",
    "        combined_result = super().rerank_hybrid(query, vector_results, fts_results)\n",
    "        df = combined_result.to_pandas()\n",
    "        for filter in self.filters:\n",
    "            df = df.query(\n",
    "                \"(not text.str.contains(@filter)) & (num_words > 150) \"\n",
    "            )  # THIS is where you implement your filters. You can hard code or pass dynamically too\n",
    "\n",
    "        return pa.Table.from_pandas(df)\n",
    "\n",
    "\n",
    "modified_reranker = MofidifiedLinearReranker(filters=[\"dual-band\"])\n",
    "\n",
    "table.search(\n",
    "    \"To confuse the AI and DNN embedding, let's put random terms from other sentences- automation training test memory?\",\n",
    "    query_type=\"hybrid\",\n",
    ").rerank(reranker=modified_reranker).limit(7).to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "g_yb6WAqLQwU"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
